\documentclass[t,12pt,aspectratio=169]{beamer} % 16:9 宽屏比例，适合现代投影
\usepackage{ctex} % 中文支持
\usepackage{amsmath, amssymb} % 数学公式与符号
\usepackage{graphicx}
\usepackage{url}
\usepackage{verbatim}

% 主题设置（推荐简洁风格）
\usetheme{Madrid}
\usecolortheme{default} % 可选：seahorse, beaver, dolphin 等

\title{R语言统计入门第12章：线性模型}
\author{PD ET AL}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}

\begin{frame}
  \titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{目录}

\begin{enumerate}
\item[12.1.] 多项式回归
\item[12.2.] 过原点的回归分析
\item[12.3.] 设计矩阵与虚拟变量
\item[12.4.] 分组数据的线性模型
\item[12.5.] 交互效应
\item[12.6.] 可重复的双因素方差分析
\item[12.7.] 协方差分析
\item[12.8.] 模型诊断

\end{enumerate}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.1.1. 多项式回归 Polynomial regression}

\begin{itemize}
\item 问题：对 \,{\color{blue}\verb| cystfibr |} 数据，用身高以及身高的平方来解释肺功能。

\item 解答：考虑模型 $\boxed{\text{pemax} = \alpha+\beta_1 \text{height} + \beta_{11}\text{height}^2+\varepsilon}$.

{\small\color{blue}
\begin{verbatim}
> lm03<-lm(pemax~height+I(height^2))
> summary(lm03)

Call: lm(formula = pemax ~ height + I(height^2))

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept) 615.36248  240.95580   2.554   0.0181 *
height       -8.08324    3.32052  -2.434   0.0235 *
I(height^2)   0.03064    0.01126   2.721   0.0125 *
\end{verbatim}
}    

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.1.2. 画出回归曲线、置信带和预测带（程序）}

{\color{blue}
\begin{verbatim}
> pred.frame<-data.frame(height=seq(110,180,2))
> lm.pemax.hq<-lm(pemax~height+I(height^2))
> pp<-predict(lm.pemax.hq,newdata=pred.frame,interval='pred')
> pc<-predict(lm.pemax.hq,newdata=pred.frame,interval='conf')
> plot(height,pemax,ylim=c(0,200))
> matlines(pred.frame$height,pp,lty=c(1,2,2),col='red')
> matlines(pred.frame$height,pc,lty=c(1,3,3),col='blue')
\end{verbatim}
}    

\begin{itemize}
\item[1.] 数据框 \,{\color{blue}\verb|pred.frame|} 存放一列等间隔的身高值（110 到 180，步长为 2），用于计算这些点上的回归预测值。
\item[2.] 函数 \,{\color{blue}\verb|predict()|} 计算预测区间和置信区间。
\item[3.] 函数 \,{\color{blue}\verb|matlines()|} 可以把矩阵的各列同时画成多条曲线（此处为拟合值及区间的下限、上限）。
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.1.3. 画出回归曲线、置信带和预测带（图像） }

     \begin{figure}
     \centering
     \includegraphics[height=0.65\textheight, width=0.9\textwidth, keepaspectratio]{cystfibr-pemax-height-quadratic.png}
     \caption{Quadratic regression with confidence and prediction limits. }
     \end{figure}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.2.1. 过原点的回归 Regression through the origin}

\begin{itemize}
\item[1.] 问题：什么是过原点的线性模型？
\item 解答：就是截距项为零的线性模型，
\[ y=\beta_1x_1 + \cdots + \beta_px_p + \varepsilon. \]

\vspace{0.3cm}

\item[2.] 模拟一个过原点的一元线性回归模型。
\item 解答：考虑模型 $y=2x+\varepsilon$.
{\color{blue}
\begin{verbatim}
> x <- runif(20)
> y <- 2*x+rnorm(20,0,0.3)
> summary(lm(y~x))
\end{verbatim}
}

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.2.2. 过不过原点的模型比较}

\begin{itemize}
\item  问题：比较不过原点与过原点的两个模型的方差分析表：
{\color{blue}
\begin{verbatim}
> anova(lm(y~x))
> anova(lm(y~x-1))
\end{verbatim}
}

\item 解答：方差分析是研究在极小模型的残差平方和中，理论模型能解释多少。样本均值 $\bar{y}$ 是 $\alpha$ 的估计。

\begin{center}
\begin{tabular}{|c|c|c|c|}  \hline 
 		& 理论模型 	& 极小模型					& 总离差平方和  \\ \hline 
不过原点 	& $y=\alpha+\beta x+ \varepsilon$ & $y=\alpha + \varepsilon$	& $SST=\sum (y_i-\bar{y})^2 $   \\ \hline 
过原点 	& $y= \beta x+ \varepsilon$ & $y= \varepsilon$ & $SST=\sum y_i^2 $   \\ \hline 
\end{tabular}
\end{center}

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.3.1. 设计矩阵与虚拟变量 Design matrices }

\begin{itemize}
\item 问题：使用 \,{\color{blue}\verb+model.matrix()+} 这个函数得到设计矩阵。如果自变量里有因子型变量，会如何处理？

\item 解答：因子型数据会用虚拟变量 (dummy variables) 来表示。

{\color{blue}
\begin{verbatim}
> model.matrix(pemax~height+weight)
> attach(red.cell.folate)
> model.matrix(folate~ventilation)
   (Intercept) ventilationN2O+O2,op ventilationO2,24h
1            1                    0                 0
2            1                    0                 0
...
21           1                    0                 1
22           1                    0                 1
\end{verbatim}
}


\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.3.2. 虚拟变量 dummy variables}

\begin{itemize}
\item 问题：定义虚拟变量的 treatment contrasts 方法，具体是怎么进行的？

%对分组数据，定义虚拟变量的方法有哪些？

\item 解答：把第一组当作未作处理的组，回归系数直接显示了其它组相比第一组的处理效果。

{\color{blue}
\begin{verbatim}
> summary(lm(folate~ventilation))
...
Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)            316.62      16.16  19.588 4.65e-14 ***
ventilationN2O+O2,op   -60.18      22.22  -2.709   0.0139 *  
ventilationO2,24h      -38.62      26.06  -1.482   0.1548    
...
\end{verbatim}
}

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.4.1. 分组数据的线性模型 Linearity over groups}

\begin{itemize}
\item 问题：如何理解线性回归模型是单因素方差分析模型的子模型？

\item 解答：We thus have two alternative models for the same data. Both belong to the class of linear models that lm is capable of handling. The linear regression model is a submodel of the model for one-way analysis of variance because the former can be obtained {\color{blue}by placing restrictions on the parameters of the latter (namely that the true group means lie on a straight line)}.




\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.4.2. 年龄对胰蛋白酶浓度的方差分析（1）}

\begin{itemize}
\item 问题：数据框 \,{\color{blue}\verb+fake.trypsin+} 包含271行3列，变量分别为
\begin{itemize}
\item 免疫胰蛋白酶浓度 \,{\color{blue}\verb+trypsin+}
\item 年龄分组 \,{\color{blue}\verb+grp+}
\item 年龄分组因子型 \,{\color{blue}\verb+grpf+}
\end{itemize}

数据是根据每组的均值与方差模拟生成的。分别对数值型年龄和因子型年龄进行单因素方差分析。

\item 解答：

{\color{blue}
\begin{verbatim}
> attach(fake.trypsin)
> anova(lm(trypsin~grp))
> anova(lm(trypsin~grpf))
\end{verbatim}
}


\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.4.3. 年龄对胰蛋白酶浓度的方差分析（2）}

\begin{itemize}
\item 问题：用 \,{\color{blue}\verb+anova()+} 函数来比较两个线性模型，得出什么结论？
{\color{blue}
\begin{verbatim}
> model1 <- lm(trypsin~grp)      #linear model
> model2 <- lm(trypsin~grpf)     #one-way anova
> anova(model1,model2)
> anova(lm(trypsin~grp+grpf))
\end{verbatim}
}

\item 解答：
\begin{itemize}
\item We see that the model reduction has a nonsignificant p-value and hence that \,{\color{blue}\verb+model2+} does not fit data significantly better than \,{\color{blue}\verb+model1+}.

\item This technique works only when one model is a submodel of the other, which is the case here {\color{red}since the linear model is defined by a restriction on the group means}.
\end{itemize}

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.4.4. 胰蛋白酶浓度的分组数据（图形）}

     \begin{figure}
     \centering
     \includegraphics[height=0.65\textheight, width=0.9\textwidth, keepaspectratio]{figure-12-4-1.png}
     \caption{胰蛋白酶浓度的分组数据（伪）}
     \end{figure}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.4.5. 胰蛋白酶浓度的分组数据（程序）}

{\color{blue}
\begin{verbatim}
> xbar.trypsin<-tapply(trypsin,grpf,mean)
> stripchart(trypsin~grp,method='jitter',jitter=0.1,vertical=T, 
+ pch=20,col='blue')
> lines(1:6,xbar.trypsin,type='b',pch=1,cex=3,lty=3,col='red')

\end{verbatim}
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.4.6. 年龄对胰蛋白酶浓度的方差分析（3）}

\begin{itemize}
\item 问题：如何直接使用各组数据的均值、标准差和样本容量进行方差分析？

\item 解答一：以每组观测数目为权重，对每组均值进行加权回归分析。

{\color{blue}
\begin{verbatim}
> n <- c(32,137,38,44,16,4)
> tryp.mean <- c(128,152,194,207,215,218)
> tryp.sd <-c(50.9,58.5,49.3,66.3,60,14)
> gr<-1:6
> anova(lm(tryp.mean~gr+factor(gr),weights=n))
> anova(lm(tryp.mean~gr,weights=n))
\end{verbatim}
}
\item 解答二：直接计算残差平方和 $\text{SSE}$ 和自由度 $\text{df}$, 然后计算 $\text{MSE}=\text{SSE}/\text{df}$. 最后计算 $F=\text{MSR}/\text{MSE}$. 
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.5.1. 交互效应 Interactions}

\begin{itemize}
\item 问题：交互效应试图刻画什么？
\item 解答：一个自变量对因变量的影响如何随另一个自变量取值的变化而变化。

\vspace{0.3cm}

\item 问题：根据两个变量分别是数值型还是因子型，描述交互项的总效应。 
\item 解答：
\begin{itemize}
\item Interaction between two factors: consider all possible {\color{blue}combinations of levels}.
\item Interaction between a factor and a numeric variable: {\color{blue}linear effects} of the continuous variable, {\color{blue}with different slopes} within each group defined by the factor.
\item Interaction between two continuous variables: {\color{blue}a new regression variable} that is the product of the two variables.
\end{itemize}

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.6.1. 双因素方差分析 Two-way ANOVA with replication}

\begin{itemize}
\item 问题：数据框 \,{\color{blue}\verb+coking+} 包含了18行煤炭炼制焦炭的数据，用{\color{blue}烤箱宽度}和{\color{blue}温度}来描述{\color{blue}焦化时间}。分析交互项是否显著。

\item 解答：交互项是显著的。

{\small\color{blue}
\begin{verbatim}
> anova(lm(time~width*temp,data=coking))
Analysis of Variance Table

Response: time
           Df Sum Sq Mean Sq F value   Pr(>F)    
width       2 123.14   61.57  222.10 3.31e-10 ***
temp        1  17.21   17.21   62.08 4.39e-06 ***
width:temp  2   5.70    2.85   10.28   0.0025 ** 
Residuals  12   3.33    0.28                     
\end{verbatim}
}


\end{itemize}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.6.2. 煤炭的炼焦时间的双因素分析}

\begin{itemize}
\item 问题：煤炭的炼焦时间受到烤箱宽度和温度这两个因素的影响。计算每组的平均焦化时间。

\item 解答：高温炉和低温炉对炼焦时间的影响的差异，随着炉宽的增加而增加。因此需要考虑交互效应。

{\color{blue}
\begin{verbatim}
> tapply(time,list(width,temp),mean)
       1600    1900
4   3.06667 2.30000
8   7.16667 5.53333
12 10.80000 7.33333
\end{verbatim}
}

\end{itemize}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.7.1. 协方差分析 Analysis of covariance}

\begin{itemize}
\item 数据：数据框 \,{\color{blue}\verb+hellung+} 包含了四膜虫细胞 Tetrahymena 生长过程的细胞直径和密度，培养液按照是否加了葡萄糖分为两种。

\item 问题：
\begin{enumerate}
\item  用一个图形描述这个数据框里的所有数据。
\item  分别对两种培养液中的四膜虫细胞的直径(因变量)和浓度(自变量)画出回归线。
\item  使用联合模型，比较两条回归线。
\end{enumerate}

\end{itemize}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.7.2. 四膜虫细胞的直径与密度（程序）}

{\small\color{blue}
\begin{verbatim}
> par(mfrow=c(1,2))
> attach(hellung)

> plot(conc,diameter,pch=glucose)  #两组数据的散点图
> plot(conc,diameter,pch=glucose,log='xy')  #双对数图

> tethym.gluc<-hellung[glucose==1,]  #培养液加葡萄糖
> tethym.nogluc<-hellung[glucose==2,]  #培养液不加葡萄糖
> lm.gluc<-lm(log10(diameter)~log10(conc),data=tethym.gluc)
> lm.nogluc<-lm(log10(diameter)~log10(conc),data=tethym.nogluc)

> abline(lm.nogluc)  #回归直线
> abline(lm.gluc)  #回归直线
\end{verbatim}
}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.7.3. 四膜虫细胞的直径与密度（图形）}

     \begin{figure}
     \centering
     \includegraphics[height=0.65\textheight, width=0.9\textwidth, keepaspectratio]{figure-12-7-1.png}
     \caption{四膜虫细胞的直径与密度的双对数图}
     \end{figure}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.7.4. 四膜虫细胞的直径与密度（联合模型）}

\begin{itemize}
\item 问题：对整个数据集拟合模型，并检验两组样本的斜率是否相等。
\item 解答：

{\small\color{blue}
\begin{verbatim}
> lm.tethym<-lm(log10(diameter)~log10(conc)*glucose)
> summary(lm.tethym)
> anova(lm.tethym)
\end{verbatim}
}

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.8.1. 模型诊断 Diagnostics}

\begin{itemize}
\item[1.] 问题：什么是模型诊断？
\item 解答：评估模型假设是否成立，是否存在异常值。

\vspace{0.3cm}

\item[2.] 问题：对 thuesen 数据的回归模型进行诊断。

\item 解答：分别画出残差对拟合值图、标准化残差的正态QQ图、尺度-位置图 (Scale-Location)、库克距离图。
{\color{blue}
\begin{verbatim}
> attach(thuesen)
> options(na.action='na.exclude')
> lm.velo<-lm(short.velocity~blood.glucose)
> opar<-par(mfrow=c(2,2),mex=0.6,mar=c(4,4,3,2)+0.3)
> plot(lm.velo,which=1:4)
> par(opar)
\end{verbatim}
}

\end{itemize}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{12.8.2. 回归模型的诊断：残差图}

 \begin{figure}
 \centering
 \includegraphics[height=0.6\textheight, width=0.8\textwidth, keepaspectratio]{figure-12-8-1.png}
 \caption{Regression diagnostics.}
 \end{figure}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\end{document}
